In [1]:
import pandas
import math
import json
from numpy.random import *
from matplotlib import style
style.use('fivethirtyeight')
%pylab inline
In [2]:
# Load the per-site gender counts (rows: wiki site e.g. 'enwiki'; columns: gender
# labels, including a 'nan' column for unknown gender) from the 2014-10-13 snapshot.
# Missing counts become 0.
# NOTE(review): 'site_linkss-index.csv' has a doubled 's' — presumably matches the
# snapshot's actual file name; confirm against the data directory.
sitelinks = pandas.read_csv('snapshot_data/2014-10-13/property_indexes/site_linkss-index.csv',index_col=0).fillna(0)
In [3]:
# Peek at the first rows to sanity-check the loaded table.
sitelinks.head()
Out[3]:
The Wikidata Items that have no sitelinks
This means there are 24,454 humans in Wikidata that are not connected to any other Wiki. We don't know the gender for 4,172 of them; 4,027 are female, and 16,255 are male.
Now we will look at the wikis that have the most and the least gender data recorded for their humans.
In [4]:
# Derive per-wiki totals from the raw gender-count columns:
#  - human_total: all humans on the wiki (row sum over every gender column, including 'nan')
#  - gendered_total: humans whose gender is recorded (total minus the 'nan' column)
#  - gendered_per: fraction of humans with a recorded gender
#  - nonbin_total: gendered humans counted as neither 'female' nor 'male'
sitelinks['human_total'] = sitelinks.sum(axis=1)
sitelinks['gendered_total'] = sitelinks['human_total'] - sitelinks['nan']
sitelinks['gendered_per'] = sitelinks['gendered_total'] / sitelinks['human_total']
sitelinks['nonbin_total'] = sitelinks['gendered_total'] - sitelinks['female'] - sitelinks['male']
In [5]:
# Wikis with more than 10,000 humans, ordered by gender-coverage ratio (worst first).
# NOTE(review): DataFrame.sort() is the old pandas API (removed in pandas 0.20);
# use sort_values() if this notebook is re-run on a modern pandas.
sitelinks[sitelinks['human_total'] > 10000].sort('gendered_per').head()
Out[5]:
In [5]:
In [6]:
# Display-only: mask zero counts (shown as NaN) so non-zero entries stand out.
# The result is not assigned; sitelinks itself is unchanged.
sitelinks[sitelinks != 0]
Out[6]:
In [7]:
# Drop the row whose index label is NaN (the items with no sitelink at all),
# mutating sitelinks in place.
# NOTE(review): relies on NaN-label matching in this pandas version — confirm
# the NaN-indexed row is actually removed after this cell.
sitelinks.drop(float('nan'), inplace=True)
In [8]:
# Collect the distinct project suffixes that follow 'wiki' in the site names
# ('' for plain Wikipedia, 'quote', 'source', ...). Names that split into three
# parts — i.e. contain 'wiki' twice, like 'wikidatawiki' — are skipped.
suffixes = set()
for site_name in sitelinks.index:
    pieces = site_name.split('wiki')
    if len(pieces) != 3:
        suffixes.add(pieces[1])
In [9]:
# Show the set of project suffixes discovered above.
suffixes
Out[9]:
In [10]:
def wikityper(wikiname):
    """Return the project type encoded in a site name.

    The part after 'wiki' names the project ('quote', 'source', ...);
    an empty remainder means a plain Wikipedia, reported as 'pedia'.
    """
    suffix = wikiname.split('wiki')[1]
    if suffix == '':
        return 'pedia'
    return suffix
def wikilanger(wikiname):
    """Return the language code preceding 'wiki' in a site name,
    or None when there is no language prefix (e.g. 'wikidata')."""
    prefix = wikiname.split('wiki')[0]
    return prefix or None
# Tag each row with its project type and language, and derive gender ratios.
# NOTE(review): assigning map(...) directly works because this is Python 2,
# where map returns a list; wrap in list(...) if migrating to Python 3.
sitelinks['wikitype'] = map(wikityper, sitelinks.index)
sitelinks['wikilang'] = map(wikilanger, sitelinks.index)
# Share of gendered biographies that are female / neither female nor male.
sitelinks['fem_per'] = sitelinks['female'] / sitelinks['gendered_total']
sitelinks['nonbin_per'] = sitelinks['nonbin_total'] / sitelinks['gendered_total']
In [11]:
# Group rows by project type ('pedia', 'quote', 'source', ...).
wikitypes = sitelinks.groupby(by='wikitype')
In [12]:
# Average gender counts and ratios per project type.
wikitypes.mean()[['female','male','fem_per','nonbin_per','gendered_total']]
Out[12]:
So this means that the female percentage is actually highest on Wikipedias in general, at 16%, whereas for Wikiquote and Wikisource it's only 8.5% and 4.7% respectively.
In [13]:
# Mapping from wiki language codes to full names (e.g. 'en' -> 'English Wikipedia').
lang_map = json.load(open('helpers/wiki_code_map.json','r'))
def lookup_lang(lang, lang_lookup=None):
    """Translate a wiki language code into a human-readable language name.

    Looks `lang` up in `lang_lookup` (defaults to the module-level `lang_map`)
    and strips a trailing 'Wikipedia' word from the mapped name, so
    'English Wikipedia' becomes 'English'.

    Parameters
    ----------
    lang : str
        Wiki language code, e.g. 'en'.
    lang_lookup : dict, optional
        code -> full-name mapping; defaults to `lang_map` loaded above.
        (New optional parameter; existing single-argument callers unaffected.)

    Returns
    -------
    str
        The language name, or `lang` itself when no mapping exists.
    """
    mapping = lang_map if lang_lookup is None else lang_lookup
    try:
        full = mapping[lang]
    except KeyError:
        # Unknown code: fall back to the raw code rather than failing.
        # BUG FIX: the original used a bare `except:`, which also swallowed
        # unrelated errors; the lookup can only raise KeyError here.
        return lang
    words = full.split()
    if words and words[-1].lower() == 'wikipedia':
        return ' '.join(words[:-1])
    return full
In [14]:
# Bar-chart panels of gender composition across Wikipedia languages.
# Outer loop: two orderings of the wikis (by gendered-biography count, then by
# female share). Middle loop: the two metrics to plot (female share, non-binary
# share). Inner loop: the ordered wikis are cut into `splitpoints` consecutive
# slices so each panel compares similarly-sized wikis.
splitpoints = 5
for sort_term, sort_term_text in [('gendered_total', 'number of Gendered Biographies'), ('fem_per', 'percentage of female Biographies')]:
    # Wikipedias only, ordered by the current sort column (old pandas .sort API).
    ssl = sitelinks[sitelinks['wikitype']=='pedia'].sort([sort_term])
    planstep = len(ssl)/float(splitpoints)  # rows per panel (fractional; slices use ceil/floor)
    for per_type, std_ylim, title_text in [('fem_per', 0.8, 'Female Composition'), ('nonbin_per',0.005, 'Non-binary Gender Percentage')]:
        # NOTE(review): std_ylim is carried in the tuples but never used below.
        fig, axes = plt.subplots(nrows=splitpoints, ncols=1, figsize=(12,20))
        plt.subplots_adjust(hspace = 0.8 )
        for splitpoint in range(0,splitpoints):
            begin = int(math.ceil(splitpoint * planstep))
            end = int(math.floor((splitpoint+1) * planstep))
            bios_list = ssl.iloc[begin:end]['gendered_total']
            minbio = int(min(bios_list))
            maxbio = int(max(bios_list))
            ratios_list = ssl.iloc[begin:end][per_type]
            maxratio = max(ratios_list)
            minratio = min(ratios_list)
            # Bar colour encodes wiki size relative to the largest wiki in this panel.
            bios_size = bios_list.apply(lambda x: math.log(x)/math.log(maxbio))
            my_colors = [(x, x/2, 0.75) for x in bios_size]
            ssl.iloc[begin:end][per_type].plot(ax=axes[splitpoint], kind='bar', color=my_colors)
            axes[splitpoint].set_title(" %s with %s to %s gendered biographies" % (title_text, minbio, maxbio))
            axes[splitpoint].set_ylim((minratio*0.9,maxratio*1.1))
            axes[splitpoint].grid(False)
            axes[splitpoint].yaxis.grid(True, linestyle="--", linewidth=0.3)
            axes[splitpoint].lines[0].set_visible(False)
            axes[splitpoint].yaxis.set_ticks_position('none')
            axes[splitpoint].xaxis.set_ticks_position('none')
            # Replace raw site names ('enwiki', ...) with full language names.
            wikilabels = axes[splitpoint].get_xticklabels()
            wikinames = map(lambda x: x.get_text().split('wiki')[0], wikilabels)
            fullnames = map(lookup_lang, wikinames)
            axes[splitpoint].set_xticklabels(fullnames)
        fig.suptitle("""%s of all Wikipedia Languages\n
ordered by %s. Color is locally relative Wiki Size""" % (title_text, sort_term_text), fontsize=24)
        plt.show()  # BUG FIX: was `plt.show` — attribute access, never actually called
In [15]:
# Log-scale bar chart of gendered-biography counts across all Wikipedias.
sitelinks[sitelinks['wikitype']=='pedia'].sort(['gendered_total'])['gendered_total'].plot(figsize=(36,6),kind='bar', logy=True)
Out[15]:
In [29]:
#find the correct cut off so that we are only inspecting the top TOP wikis by gendered biographies
TOP = 50

def _top_wikis(frame, column, top):
    """Return the Wikipedias above the smallest `column` threshold that keeps at most `top` rows.

    Scans thresholds 0, 1, 2, ... keeping rows where wikitype == 'pedia' and
    frame[column] > threshold, stops at the first threshold whose subset has
    <= `top` rows, and prints that threshold (mirroring the two copy-pasted
    loops this helper replaces). Returns the subset DataFrame (None only if
    frame[column] has no positive maximum).
    """
    subset = None
    for cutoff in range(0, int(max(frame[column]))):
        subset = frame[(frame['wikitype']=='pedia') & (frame[column] > cutoff)]
        if len(subset) <= top:
            print(cutoff)
            break
    return subset

# DEDUP: the original ran two near-identical threshold-search loops.
scatdata = _top_wikis(sitelinks, 'gendered_total', TOP)
nonbintot = _top_wikis(sitelinks, 'nonbin_total', TOP)
In [31]:
# Inspect the Top-N subset selected above.
scatdata.head()
Out[31]:
In [30]:
# Export the scatter inputs for the external (Magnus) gender analysis.
scatdata[['gendered_total','fem_per']].to_csv('Magnus Gender analysis/lang_scat.csv')
In [17]:
# Scatter: female share vs. gendered-biography count (log x) for the Top wikis,
# with each point labelled by its full language name.
sp =scatdata.plot(kind='scatter', x='gendered_total', y='fem_per', logx=True, figsize=(16,10), c='#e3ae3d')
# Turn index site names ('enwiki', ...) into language codes, then full names.
codes = map(lambda x: str(x).split('wiki')[0], scatdata.index)
fullnames = map(lookup_lang,codes)
# Pad axis limits slightly so edge points are not clipped.
sp.set_xlim(min(scatdata['gendered_total']) * 0.85, max(scatdata['gendered_total']) *1.15)
sp.set_ylim(min(scatdata['fem_per']) * 0.95, max(scatdata['fem_per']) *1.05)
# Format y as percentages and x with thousands separators.
sp.yaxis.set_major_formatter(matplotlib.ticker.FuncFormatter(lambda x, y: '{:.0%}'.format(x )))
sp.xaxis.set_major_formatter(matplotlib.ticker.FuncFormatter(lambda x, y: '{:,.0f}'.format(x )))
sp.set_title('Female ratio of biographies, by Wikipedia Language \nTop {} Wikipedias by Biography count\n'.format(TOP), fontsize=24)
sp.set_xlabel('Number of gendered Biographies, log scale', fontsize=18)
sp.set_ylabel('Female ratio of Biographies', fontsize=18)
# NOTE(review): `middle` is computed but never used below.
(x1, x2), (y1, y2) = sp.get_xlim(), sp.get_ylim()
middle = (x2-x1)/2.0 , (y2-y1)/2.0
f = matplotlib.font_manager.FontProperties()
font1 = f.copy()
#font1.set_weight('light')
# Annotate each point; a few crowded labels are nudged down-right to reduce overlap.
for label, x, y in zip(fullnames, scatdata['gendered_total'], scatdata['fem_per']):
    plt.annotate(
        label,
        xy = (x, y),
        xytext = (3,-3) if label in ['Latvian','Polish','Dutch','Slovak','Hungarian'] else (0,2),
        textcoords = 'offset points', ha = 'center', va = 'bottom',
        fontsize=8, fontproperties=font1)
In [18]:
# Scatter: non-binary share vs. gendered-biography count (log x) for the Top wikis.
# BUG FIX: the axes variable was named `np`, shadowing numpy's `np` alias that
# %pylab injects into the namespace; renamed to nb_ax.
nb_ax = scatdata.plot(kind='scatter', x='gendered_total', y='nonbin_per', logx=True, figsize=(16,10), c='#f34141')
# Turn index site names ('enwiki', ...) into language codes, then full names.
codes = map(lambda x: str(x).split('wiki')[0], scatdata.index)
fullnames = map(lookup_lang,codes)
# Format y as percentages and x with thousands separators; pad the limits
# slightly so edge points are not clipped.
nb_ax.yaxis.set_major_formatter(matplotlib.ticker.FuncFormatter(lambda x, y: '{:.2%}'.format(x )))
nb_ax.xaxis.set_major_formatter(matplotlib.ticker.FuncFormatter(lambda x, y: '{:,.0f}'.format(x )))
nb_ax.set_xlim(min(scatdata['gendered_total']) * 0.85, max(scatdata['gendered_total']) *1.15)
nb_ax.set_ylim(min(scatdata['nonbin_per']) * 0.95, max(scatdata['nonbin_per']) *1.05)
nb_ax.set_title('Nonbinary ratio of biographies, by Wikipedia Language \nTop {} Wikipedias by Biography count\n'.format(TOP), fontsize=24)
nb_ax.set_xlabel('Number of gendered Biographies, log scale', fontsize=18)
nb_ax.set_ylabel('Nonbinary ratio of Biographies', fontsize=18)
f = matplotlib.font_manager.FontProperties()
font1 = f.copy()
#font1.set_weight('light')
# Annotate each point; the crowded 'Czech' label is nudged down-right.
for label, x, y in zip(fullnames, scatdata['gendered_total'], scatdata['nonbin_per']):
    plt.annotate(
        label,
        xy = (x, y),
        xytext = (5,-5) if label in ['Czech'] else (0,2),
        textcoords = 'offset points', ha = 'center', va = 'bottom',
        fontsize=8, fontproperties=font1)
plt.show()
# Disabled draft (nonbin_total variant) kept as-is; being the cell's last
# expression, this string is what Out[18] displays.
'''
np = nonbintot.plot(kind='scatter', x='nonbin_total', y='nonbin_per', logx=True, figsize=(16,10))
codes = map(lambda x: str(x).split('wiki')[0], nonbintot.index)
fullnames = map(lookup_lang,codes)
np.set_xlim(min(nonbintot['nonbin_total']) * 0.85, max(nonbintot['nonbin_total']) *1.15)
np.set_ylim(min(nonbintot['nonbin_per']) * 0.95, max(nonbintot['nonbin_per']) *1.05)
np.set_title('Nonbinary percentage of biographies, by Wikipedia Language \nTop 50 Wikipedias by Biography count\n', fontsize=24)
np.set_xlabel('Number of gendered Biographies, log scale', fontsize=18)
np.set_ylabel('Non binary percentage of Biographies', fontsize=18)
for label, x, y in zip(fullnames, nonbintot['nonbin_total'], nonbintot['nonbin_per']):
    plt.annotate(
        label,
        xy = (x, y), xytext = (0,2),
        textcoords = 'offset points', ha = 'center', va = 'bottom',
        fontsize=8)
plt.show()
'''
Out[18]:
In [19]:
# Exploratory: 2-component PCA on (female share, gendered count) for the larger
# Wikipedias (>1000 gendered biographies). The fitted estimator's repr is the
# cell output.
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
pca.fit(sitelinks[(sitelinks['wikitype']=='pedia')& (sitelinks['gendered_total']>1000)][['fem_per','gendered_total']])
Out[19]:
In [20]:
# Scratch check: a tiny negative PCA component truncates to 0 under int().
int(float(-6.2590421446e-09))
Out[20]:
In [21]:
# 1-component PCA on log-transformed (nonbinary share, gendered count), using
# only Wikipedias with a non-zero nonbinary share (log of 0 is undefined).
from sklearn.decomposition import PCA  # NOTE(review): duplicate of the import above; harmless
pca = PCA(n_components=1)
logtrans = pandas.DataFrame()
logtrans['nonbin_per'] = sitelinks[(sitelinks['wikitype']=='pedia') & (sitelinks['nonbin_per'] != 0)]['nonbin_per'].apply(math.log)
logtrans['gen_tot'] = sitelinks[(sitelinks['wikitype']=='pedia')& (sitelinks['nonbin_per'] != 0)]['gendered_total'].apply(math.log)
pca.fit(logtrans)
Out[21]:
In [22]:
# Show the fitted PCA direction and its explained-variance ratio.
# NOTE(review): Python 2 print statement, while other cells use print(...);
# convert to print(...) — as two calls, to keep the same output — if migrating
# this notebook to Python 3.
print pca.components_, pca.explained_variance_ratio_
In [23]:
# Correlation between the total gendered count and the per-gender counts.
sitelinks[['gendered_total','female','male']].corr()
Out[23]: